pacman::p_load(sf, tmap,
spdep, tidyverse,
httr, jsonlite,
SpatialAcc, ggstatsplot,
olsrr, corrplot, ggpubr,
GWmodel, tmap,
DT, plotly, patchwork,
ranger, SpatialML,
rsample, Metrics)Take-home Ex 3: Predicting HDB Resale Prices with Geographically Weighted Machine Learning Methods
1.0 Introduction
1.1 The Task
In this take-home exercise, we are required to calibrate a predictive model to predict HDB resale prices between July-September 2024 by using HDB resale transaction records in 2023.
1.2 The Data
Below is the list of data used for this take-home exercise. These data are extracted from data.gov.sg and LTA Data Mall. We will be looking at resale flat that are 4-room, for ease of data manipulation. Similar method could be applied for resale HDB of various size.
Structural factors (From resale data)
Area of the unit
Floor level
Remaining lease
Age of the unit
Locational factors
Proxomity to CBD
Proximity to eldercare
Proximity to foodcourt/hawker centres
Proximity to MRT
Proximity to park
Proximity to good primary school
Proximity to shopping mall
Proximity to supermarket
Numbers of kindergartens within 350m
Numbers of childcare centres within 350m
Numbers of bus stop within 350m
Numbers of primary school within 1km
2.0 Getting Started
2.1 Setting up the environment
2.2 Importing and Wrangling the Data
Importing all the data into the R environment.
2.2.1 Resale HDB Data
Resale data (3-Room Flat), and transaction between Jan 2023 to Sep 2024. Selecting to retain only relevant fields:
Address
Postal Code
Area of the unit
Floor level
Remaining lease
Age of the unit
x-y coordinates (left_join with coordinates extracted through reverse geo-coding with address using onemap API)
resale <- read_csv("data/rawdata/resale.csv") %>%
filter(flat_type == "3 ROOM") %>%
filter(month >= "2023-01" & month <= "2024-09") %>%
mutate(address = paste(block,street_name)) %>%
mutate(remaining_lease_yr = as.integer(
str_sub(remaining_lease, 0, 2)))%>%
mutate(remaining_lease_mth = as.integer(
str_sub(remaining_lease, 9, 11))) %>%
separate(storey_range, into = c("min_storey", "max_storey"), sep = " TO ", convert = TRUE) %>%
mutate(storey_order = as.numeric(min_storey)) %>%
select(month, resale_price, address, floor_area_sqm, storey_order, remaining_lease_yr, remaining_lease_mth)Extracting the coords:
add_list <- sort(unique(resale$address))get_coords <- function(add_list){
# Create a data frame to store all retrieved coordinates
postal_coords <- data.frame()
for (i in add_list){
#print(i)
r <- GET('https://www.onemap.gov.sg/api/common/elastic/search?',
query=list(searchVal=i,
returnGeom='Y',
getAddrDetails='Y'))
data <- fromJSON(rawToChar(r$content))
found <- data$found
res <- data$results
# Create a new data frame for each address
new_row <- data.frame()
# If single result, append
if (found == 1){
postal <- res$POSTAL
lat <- res$LATITUDE
lng <- res$LONGITUDE
new_row <- data.frame(address= i,
postal = postal,
latitude = lat,
longitude = lng)
}
# If multiple results, drop NIL and append top 1
else if (found > 1){
# Remove those with NIL as postal
res_sub <- res[res$POSTAL != "NIL", ]
# Set as NA first if no Postal
if (nrow(res_sub) == 0) {
new_row <- data.frame(address= i,
postal = NA,
latitude = NA,
longitude = NA)
}
else{
top1 <- head(res_sub, n = 1)
postal <- top1$POSTAL
lat <- top1$LATITUDE
lng <- top1$LONGITUDE
new_row <- data.frame(address= i,
postal = postal,
latitude = lat,
longitude = lng)
}
}
else {
new_row <- data.frame(address= i,
postal = NA,
latitude = NA,
longitude = NA)
}
# Add the row
postal_coords <- rbind(postal_coords, new_row)
}
return(postal_coords)
}coords <- get_coords(add_list)write_rds(coords,"data/rds/coords.rds")coords <- read_rds("data/rds/coords.rds")resale_xy <- left_join(resale, coords,
by = "address") %>%
st_as_sf(coords = c("longitude", "latitude"),
crs=4326) %>%
st_transform(crs = 3414)write_rds(resale_xy, "data/rds/resale_xy.rds")resale_xy <- read_rds("data/rds/resale_xy.rds") 2.2.2 Proximity to CBD



Based on scribblemaps.com and google map, we can have a sense of Singapore’s CBD and Central Area. Central Area includes area spanning Orchard, Chinatown, Marina Bay, Marina East, Bras Basah, Rochor and Newton. For the purpose of this exercise, we will be taking Dhoby Ghaut MRT station(1.299866722252685, 103.8454773226203) as the definition of our centroid of the CBD area for distance calculation purpose. The reason for choosing this point is as such:
- Dhoby Ghaut MRT station serves three lines (N-S Line, N-E Line, and Circle Line)
- It is approximately central of the Central Area as per the google map.
Another possible centroid would be City Hall MRT(1.2932052372864624, 103.8519615479051), where it serves 2 main MRT lines (N-S Line and E-W Line) and would be representative of centroid when we consider including the Marina Bay East area as part of the Central Area of Singapore.
Next, we will create the sf object for Dhoby Ghaut MRT station.
cbd_dg <- data.frame(longitude = "103.8454773226203",
latitude = "1.299866722252685") %>%
st_as_sf(coords = c("longitude", "latitude"),
crs=4326) %>%
st_transform(crs = 3414)Then, we will check to ensure that both sf data.frame have the same CRS:
st_crs(cbd_dg) == st_crs(resale_xy) [1] TRUE
Next, we will calculate the distance(in metres) using st_distance and append it back to resale_xy:
distances <- st_distance(cbd_dg, resale_xy)
resale_xy$dist_to_cbd <- as.numeric(distances)2.2.3 Proximity to Eldercare
Since there may be multiple eldercare within the area, we will be using the proximity to the nearest available eldercare facility for the resale HDB. ELDERCARE is in shapefile format, hence, we will use st_read() to extract the file as sf data.frame, and also ensure the EPSG code is 3414 using st_transform().
eldercare <- st_read(dsn = "data/rawdata",
layer = "ELDERCARE") %>%
st_transform(crs = 3414) %>%
select(geometry)Reading layer `ELDERCARE' from data source
`C:\rydialiang\isss626-aug24\Take-home Exercise\Take-home_Ex03\data\rawdata'
using driver `ESRI Shapefile'
Simple feature collection with 133 features and 18 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 14481.92 ymin: 28218.43 xmax: 41665.14 ymax: 46804.9
Projected CRS: SVY21
st_crs(eldercare)Coordinate Reference System:
User input: EPSG:3414
wkt:
PROJCRS["SVY21 / Singapore TM",
BASEGEOGCRS["SVY21",
DATUM["SVY21",
ELLIPSOID["WGS 84",6378137,298.257223563,
LENGTHUNIT["metre",1]]],
PRIMEM["Greenwich",0,
ANGLEUNIT["degree",0.0174532925199433]],
ID["EPSG",4757]],
CONVERSION["Singapore Transverse Mercator",
METHOD["Transverse Mercator",
ID["EPSG",9807]],
PARAMETER["Latitude of natural origin",1.36666666666667,
ANGLEUNIT["degree",0.0174532925199433],
ID["EPSG",8801]],
PARAMETER["Longitude of natural origin",103.833333333333,
ANGLEUNIT["degree",0.0174532925199433],
ID["EPSG",8802]],
PARAMETER["Scale factor at natural origin",1,
SCALEUNIT["unity",1],
ID["EPSG",8805]],
PARAMETER["False easting",28001.642,
LENGTHUNIT["metre",1],
ID["EPSG",8806]],
PARAMETER["False northing",38744.572,
LENGTHUNIT["metre",1],
ID["EPSG",8807]]],
CS[Cartesian,2],
AXIS["northing (N)",north,
ORDER[1],
LENGTHUNIT["metre",1]],
AXIS["easting (E)",east,
ORDER[2],
LENGTHUNIT["metre",1]],
USAGE[
SCOPE["Cadastre, engineering survey, topographic mapping."],
AREA["Singapore - onshore and offshore."],
BBOX[1.13,103.59,1.47,104.07]],
ID["EPSG",3414]]
As there are multiple eldercares, we would need to first find the distance matrix, then find the minimum distance amongst all the matrix, and lastly, append the minimum dist back to our resale_xy dataframe.
dist_elder <- st_distance(resale_xy,eldercare)
min_distances <- apply(dist_elder, 1, min)
resale_xy$dist_eldercare <- min_distances2.2.4 Proximity to Foodcourt/hawker center
Similar steps are applied to Foodcourt/hawker center.
food <- st_read(dsn = "data/rawdata/HawkerCentresKML.kml") %>%
st_transform(crs = 3414) %>%
select(geometry) Reading layer `HAWKERCENTRE' from data source
`C:\rydialiang\isss626-aug24\Take-home Exercise\Take-home_Ex03\data\rawdata\HawkerCentresKML.kml'
using driver `KML'
Simple feature collection with 125 features and 2 fields
Geometry type: POINT
Dimension: XYZ
Bounding box: xmin: 103.6974 ymin: 1.272716 xmax: 103.9882 ymax: 1.449017
z_range: zmin: 0 zmax: 0
Geodetic CRS: WGS 84
dist_food <- st_distance(resale_xy,food)
min_distances <- apply(dist_food, 1, min)
resale_xy$dist_food <- min_distances2.2.5 Proximity to MRT
Similar steps are applied to MRT.
mrt <- st_read(dsn = "data/rawdata",
layer = "PassengerPickupBay") %>%
st_transform(crs = 3414) %>%
select("geometry") %>%
st_centroid()Reading layer `PassengerPickupBay' from data source
`C:\rydialiang\isss626-aug24\Take-home Exercise\Take-home_Ex03\data\rawdata'
using driver `ESRI Shapefile'
Simple feature collection with 145 features and 1 field
Geometry type: LINESTRING
Dimension: XY
Bounding box: xmin: 12822.28 ymin: 27597.34 xmax: 43035.91 ymax: 47900.22
Projected CRS: SVY21
dist_mrt <- st_distance(resale_xy,mrt)
min_distances <- apply(dist_mrt, 1, min)
resale_xy$dist_mrt <- min_distances2.2.6 Proximity to Park
Similar steps are applied to Park. As geojson file contains geometry information with Z-dimension (height), we will use st_zm() to remove this dimensions since we do not need this information.
park <- st_read(dsn = "data/rawdata/park.geojson") %>%
st_zm() %>%
st_transform(crs = 3414) %>%
select(geometry)Reading layer `park' from data source
`C:\rydialiang\isss626-aug24\Take-home Exercise\Take-home_Ex03\data\rawdata\park.geojson'
using driver `GeoJSON'
Simple feature collection with 1685 features and 2 fields
Geometry type: MULTIPOLYGON
Dimension: XY, XYZ
Bounding box: xmin: 103.6836 ymin: 1.201022 xmax: 104.0321 ymax: 1.464108
z_range: zmin: 0 zmax: 0
Geodetic CRS: WGS 84
dist_park <- st_distance(resale_xy,park)
min_distances <- apply(dist_park, 1, min)
resale_xy$dist_park <- min_distances2.2.7 Proximity to a good primary school
Since the definition of a “good” primary school differs, we will use the knowledge from concerned community as the gauge for a “good” primary school. For the purpose of this exercise, I adopted the two definitions that is associated with a “good” primary school:
- Special Assistance Plan (SAP) Schools - Cultural Richness in Learning
- Gifted Education Programme (GEP) Schools - Tailoring Education for the Gifted
Based on Creative Campus, the list of “good” primary school are compiled in the list named good_sch, using google map to extract the latitude and longitude.
good_sch <- data.frame(
name = c("Ai Tong School",
"Anglo-Chinese School (Primary)",
"Catholic High School",
"CHIJ St., Nicholas Girls’ School",
"Henry Park Primary School",
"Holy Innocents’ Primary School",
"Hong Wen School",
"Kong Hwa School",
"Maha Bodhi School",
"Maris Stella High School (Primary)",
"Nan Hua Primary School",
"Nanyang Primary School",
"Pei Chun Public School",
"Pei Hwa Presbyterian Primary School",
"Poi Ching School",
"Raffles Girls’ Primary School",
"Red Swastika School",
"Rosyth School",
"St. Hilda’s Primary School",
"Tao Nan School"),
latitude = c(1.3605640181003413,
1.3191507364879236,
1.355277597260772,
1.374247187308568,
1.3170721007183392,
1.366919234532623,
1.3216304962516947,
1.3113035192025317,
1.3286168882975922,
1.3413858298053156,
1.3199812404785924,
1.3207867963363251,
1.3373761205768626,
1.3385314868306721,
1.3580371568487648,
1.3302362732220747,
1.3333982209974653,
1.3731445652271157,
1.3498237347980189,
1.3052062281563859),
longitude = c(103.83272785347252,
103.83577417484831,
103.84457970481196,
103.83412198595342,
103.78415775347251,
103.8935703058735,
103.85710945532114,
103.88815583813025,
103.90150335711749,
103.87764453470902,
103.76203935532115,
103.80855218230802,
103.85576078045922,
103.77617291114346,
103.9356781534726,
103.80616995162373,
103.9341450516237,
103.8747181976503,
103.93610449580149,
103.9114018092948)
) %>%
st_as_sf(coords = c("longitude", "latitude"),
crs=4326) %>%
st_transform(crs = 3414)
write_rds(good_sch, "data/rds/good_sch.rds")dist_good_sch <- st_distance(resale_xy,good_sch)
min_distances <- apply(dist_good_sch, 1, min)
resale_xy$dist_good_sch <- min_distances2.2.8 Proximity to Shopping Mall
Similar steps are applied to Shopping Mall.
mall <- st_read(dsn = "data/rawdata",
layer = "Mall") %>%
st_transform(crs = 3414) %>%
select(geometry)Reading layer `Mall' from data source
`C:\rydialiang\isss626-aug24\Take-home Exercise\Take-home_Ex03\data\rawdata'
using driver `ESRI Shapefile'
Simple feature collection with 464 features and 8 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: 15576.2 ymin: 24936 xmax: 40537.72 ymax: 48239.39
Projected CRS: SVY21
dist_mall <- st_distance(resale_xy,mall)
min_distances <- apply(dist_mall, 1, min)
resale_xy$dist_mall <- min_distances2.2.9 Proximity to Supermarket
Similar steps are applied to Supermarket.
supermarket <- st_read(dsn = "data/rawdata/SupermarketsKML.kml") %>%
st_transform(crs = 3414) %>%
select(geometry)Reading layer `SUPERMARKETS' from data source
`C:\rydialiang\isss626-aug24\Take-home Exercise\Take-home_Ex03\data\rawdata\SupermarketsKML.kml'
using driver `KML'
Simple feature collection with 526 features and 2 fields
Geometry type: POINT
Dimension: XYZ
Bounding box: xmin: 103.6258 ymin: 1.24715 xmax: 104.0036 ymax: 1.461526
z_range: zmin: 0 zmax: 0
Geodetic CRS: WGS 84
dist_supermarket <- st_distance(resale_xy,supermarket)
min_distances <- apply(dist_supermarket, 1, min)
resale_xy$dist_supermarket <- min_distances2.2.10 Creating buffer of 350m and 1km for resale HDB
buffer_350m <- st_buffer(resale_xy,
dist = 350)buffer_1km <- st_buffer(resale_xy,
dist = 1000)Importing data for kindergartens, childcare, bus stop, and primary school:
kindergarten <- st_read(dsn = "data/rawdata/Kindergartens.kml") %>%
st_transform(crs = 3414) %>%
select(geometry)Reading layer `KINDERGARTENS' from data source
`C:\rydialiang\isss626-aug24\Take-home Exercise\Take-home_Ex03\data\rawdata\Kindergartens.kml'
using driver `KML'
Simple feature collection with 448 features and 2 fields
Geometry type: POINT
Dimension: XYZ
Bounding box: xmin: 103.6887 ymin: 1.247759 xmax: 103.9717 ymax: 1.455452
z_range: zmin: 0 zmax: 0
Geodetic CRS: WGS 84
childcare <- st_read(dsn = "data/rawdata/ChildCareServices.kml") %>%
st_transform(crs = 3414) %>%
select(geometry)Reading layer `CHILDCARE' from data source
`C:\rydialiang\isss626-aug24\Take-home Exercise\Take-home_Ex03\data\rawdata\ChildCareServices.kml'
using driver `KML'
Simple feature collection with 1925 features and 2 fields
Geometry type: POINT
Dimension: XYZ
Bounding box: xmin: 103.6878 ymin: 1.247759 xmax: 103.9897 ymax: 1.462134
z_range: zmin: 0 zmax: 0
Geodetic CRS: WGS 84
bus <- st_read(dsn = "data/rawdata",
layer = "BusStop") %>%
st_transform(crs = 3414) %>%
select(geometry)Reading layer `BusStop' from data source
`C:\rydialiang\isss626-aug24\Take-home Exercise\Take-home_Ex03\data\rawdata'
using driver `ESRI Shapefile'
Simple feature collection with 5166 features and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 3970.122 ymin: 26482.1 xmax: 48285.52 ymax: 52983.82
Projected CRS: SVY21
pri_sch <- st_read(dsn = "data/rawdata/LTASchoolZoneKML.kml") %>%
st_transform(crs = 3414)Reading layer `SCHOOLZONE' from data source
`C:\rydialiang\isss626-aug24\Take-home Exercise\Take-home_Ex03\data\rawdata\LTASchoolZoneKML.kml'
using driver `KML'
Simple feature collection with 211 features and 2 fields
Geometry type: MULTIPOLYGON
Dimension: XY, XYZ
Bounding box: xmin: 103.687 ymin: 1.272736 xmax: 103.9668 ymax: 1.457587
z_range: zmin: 0 zmax: 0
Geodetic CRS: WGS 84
Since the primary school I have got is the a school zone, we will use st_centroid() to find the centroid of each school’s location, and use it to calculate distance. We will also need to drop the Z-dimension using st_zm(). Also, to make the name of primary school more readable, we will use sub() of base R to extract the name of the primary school.
pri_sch <- pri_sch %>%
st_zm() %>%
st_centroid()
pri_sch$Description <- sub(".*<th>SITENAME</th> <td>(.*?)</td>.*", "\\1", pri_sch$Description)
write_rds(pri_sch, "data/rds/pri_sch.rds")2.2.11 Counting the numbers of facilities within the buffers
Counting points for kindergartens, childcare, bus stop, and primary school:
resale_xy$kinder_count <- lengths(
st_intersects(buffer_350m, kindergarten))
resale_xy$childcare_count <- lengths(
st_intersects(buffer_350m, childcare))
resale_xy$bus_count <- lengths(
st_intersects(buffer_350m, bus))
resale_xy$pri_sch_count <- lengths(
st_intersects(buffer_1km, pri_sch))write_rds(resale_xy, "data/rds/resale_xy.rds")2.2.12 Importing the Singapore Boundary
mpsz = st_read(dsn = "data/rawdata/",
layer = "MP14_SUBZONE_WEB_PL") %>%
st_transform(crs = 3414)Reading layer `MP14_SUBZONE_WEB_PL' from data source
`C:\rydialiang\isss626-aug24\Take-home Exercise\Take-home_Ex03\data\rawdata'
using driver `ESRI Shapefile'
Simple feature collection with 323 features and 15 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: 2667.538 ymin: 15748.72 xmax: 56396.44 ymax: 50256.33
Projected CRS: SVY21
3.0 Exploratory Data Analysis (EDA)
3.1 HDB Resale Price
resale_xy <- read_rds("data/rds/resale_xy.rds")ggplot(data=resale_xy,
aes(x=`resale_price`)) +
geom_histogram(bins=20,
color="black",
fill="light blue") +
theme_minimal()
From this histogram, we can see the resale price is right skewed, with right tail as high as in the range of $1.568 million, which is quite a high price for a 3-Room HDB in Singapore. The peak of the histogram is around $400,000, suggesting the resale price median and mean price is around this value.
Let’s take a look at the outliers using the ggplotly.
p <- ggplot(data=resale_xy,
aes(y=`resale_price`)) +
geom_boxplot(color="black",
fill="light blue") +
theme_minimal()
ip <- ggplotly(p)
ipFrom the boxplot, we observe that there’s one low outlier price of $150K, and many high outliers above $590K (upper quartile).
Next we will take a look at the data above $590K to determine how we want to treat these outliers.
resale_above590k <- resale_xy[resale_xy$resale_price > 590000, ]
datatable(resale_above590k)Investigating the relationship between the resale_price and floor_area_sqm:
ggplot(data = resale_xy,
aes(x = floor_area_sqm,
y = resale_price)) +
geom_point() +
geom_smooth(method = "lm", color = "blue", se = FALSE) + # Adds a trendline
labs(
title = "Scatter Plot of Resale Price vs Floor Area with Trendline",
x = "Floor Area (sqm)",
y = "Resale Price"
) +
theme_minimal()
resale_final <- resale_xy %>%
filter(resale_price <= 1000000 & floor_area_sqm <= 100) %>% filter(resale_price != 150000)Lower Outlier of Price $150k will be dropped.
Higher Outliers of location at JLN MA’MOR, JLN BAHAGIA, STIRLING RD seems to have very high resale price and also much higher floor_area_sqm compared to the 60-70 sqm range of a normal 3-Room flat, as well as a remaining_lease_yr in 40 plus years range.

For the high outliers, we will drop the data that has a resale value above $1mil, more than 100 sqm.
a <- ggplot(data=resale_final,
aes(x=`resale_price`)) +
geom_histogram(bins=20,
color="black",
fill="light blue") +
theme_minimal()ggplot(data = resale_final,
aes(x = floor_area_sqm,
y = resale_price)) +
geom_point() +
geom_smooth(method = "lm", color = "blue", se = FALSE) + # Adds a trendline
labs(
title = "Scatter Plot of Resale Price vs Floor Area with Trendline",
x = "Floor Area (sqm)",
y = "Resale Price"
) +
theme_minimal()
p <- ggplot(data=resale_final,
aes(y=`resale_price`)) +
geom_boxplot(color="black",
fill="light blue") +
theme_minimal()
ip <- ggplotly(p)
ipresale_final <- resale_final %>%
mutate(`log_resale_price` = log(resale_price))
b <- ggplot(data=resale_final, aes(x=`log_resale_price`)) +
geom_histogram(bins=20, color="black", fill="light blue") +
theme_minimal()After applying log to the resale_price, we get a distribution that resemble the
a/b
write_rds(resale_final,"data/rds/resale_final.rds")3.2 Drawing Statistical Point Map
tm_shape(mpsz) +
tm_polygons() +
tm_shape(resale_final) +
tm_dots(col = "resale_price",
alpha = 0.6,
style = "quantile",
palette = "Reds") +
tm_view(set.zoom.limits = c(11, 14))
tmap_mode("view")
tm_shape(resale_final) +
tm_dots(col = "resale_price",
alpha = 0.6,
style = "quantile",
palette = "Reds") tmap_mode("plot")- Note that Punggol area has a high concentration of high resale price for 3-room HDB. These are likely newly MOP 3-room flats.
4.0 Hedonic Pricing Model for Resale HDB
resale_final <- read_rds("data/rds/resale_final.rds") %>%
drop_na()4.1 Multiple Linear Regression Model
4.1.1 Check Multi-colinearity
resale_final <- resale_final %>%
mutate(across(c(4, 5, 6, 10:21), as.numeric)) %>%
as.data.frame()
corrplot(cor(resale_final[, c(4,5,6,10:22)]),
diag = FALSE, order = "AOE",
tl.pos = "td", tl.cex = 0.8,
tl.srt = 45,
number.cex = 0.5,
method = "number", type = "upper") 
All correlations values are below 0.8, hence we can conclude that there is no multi-colinearity.
4.2 Splitting the Training data and Test data
We will use Jan 2023 to Jun 2024 data as training dataset, and use ground truth of Jul-Sep 2024 to check the performance of the model.
train_data <- resale_final %>%
drop_na() %>%
st_as_sf() %>%
filter(month >= "2023-01" & month <= "2024-06") %>%
st_jitter(amount=5)
test_data <- resale_final %>%
drop_na() %>%
st_as_sf() %>%
filter(month >= "2024-07" & month <= "2024-09") %>%
st_jitter(amount=5)write_rds(train_data, "data/rds/train_data.rds")
write_rds(test_data, "data/rds/test_data.rds")4.3 Building a non-spatial multiple linear regression
price_mlr <- lm(resale_price ~ floor_area_sqm +
+ remaining_lease_yr + storey_order +
dist_to_cbd +
dist_mrt +dist_park +dist_good_sch +
dist_mall + dist_supermarket +
kinder_count + childcare_count + bus_count +
pri_sch_count,
data=train_data)
olsrr::ols_regress(price_mlr) Model Summary
--------------------------------------------------------------------------
R 0.855 RMSE 45791.365
R-Squared 0.732 MSE 2100144522.151
Adj. R-Squared 0.731 Coef. Var 10.991
Pred R-Squared 0.731 AIC 216848.685
MAE 33880.493 SBC 216955.129
--------------------------------------------------------------------------
RMSE: Root Mean Square Error
MSE: Mean Square Error
MAE: Mean Absolute Error
AIC: Akaike Information Criteria
SBC: Schwarz Bayesian Criteria
ANOVA
-------------------------------------------------------------------------------
Sum of
Squares DF Mean Square F Sig.
-------------------------------------------------------------------------------
Regression 5.106681e+13 13 3.928216e+12 1870.45 0.0000
Residual 1.870809e+13 8908 2100144522.151
Total 6.97749e+13 8921
-------------------------------------------------------------------------------
Parameter Estimates
---------------------------------------------------------------------------------------------------------------
model Beta Std. Error Std. Beta t Sig lower upper
---------------------------------------------------------------------------------------------------------------
(Intercept) -122456.293 6835.800 -17.914 0.000 -135856.036 -109056.550
floor_area_sqm 5056.433 94.427 0.304 53.549 0.000 4871.335 5241.531
remaining_lease_yr 4064.605 36.006 0.770 112.888 0.000 3994.025 4135.184
storey_order 3696.301 96.552 0.229 38.283 0.000 3507.037 3885.566
dist_to_cbd -9.132 0.182 -0.436 -50.292 0.000 -9.488 -8.776
dist_mrt -9.299 0.969 -0.057 -9.591 0.000 -11.199 -7.398
dist_park 18.059 4.988 0.020 3.620 0.000 8.281 27.836
dist_good_sch -1.216 0.291 -0.033 -4.173 0.000 -1.788 -0.645
dist_mall -8.449 0.494 -0.103 -17.092 0.000 -9.418 -7.480
dist_supermarket 19.553 2.693 0.042 7.262 0.000 14.275 24.831
kinder_count 5744.346 660.370 0.058 8.699 0.000 4449.868 7038.824
childcare_count -2292.496 296.135 -0.051 -7.741 0.000 -2872.988 -1712.003
bus_count 1173.726 192.652 0.035 6.092 0.000 796.084 1551.369
pri_sch_count 2136.725 332.939 0.040 6.418 0.000 1484.088 2789.362
---------------------------------------------------------------------------------------------------------------
All predictors have p-value lesser than alpha = 0.05, meaning all are statistically significant, except eldercare, with a p-value of 0.284436.
Adjusted R-squared is 0.7315, suggesting a strong model with 73.15% of the variance in the resale prices is explained by the predictors.
The model also has p-value lesser than alpha = 0.05, suggesting that this model is statistically significant.
write_rds(price_mlr, "data/model/price_mlr.rds" ) 4.4 GWR Predictive Model
train_data_sp <- as_Spatial(train_data)
# Extract coordinates from the SpatialPointsDataFrame
coords <- coordinates(train_data_sp)
# Check for duplicates in the coordinates
duplicate_indices <- which(duplicated(coords))
# Print duplicate coordinates and their indices
if (length(duplicate_indices) > 0) {
cat("Duplicate points found at indices:\n")
print(duplicate_indices)
print(coords[duplicate_indices, ])
} else {
cat("No duplicate points found.")
}No duplicate points found.
test_data_sp <- as_Spatial(test_data)
# Extract coordinates from the SpatialPointsDataFrame
coords <- coordinates(test_data_sp)
# Check for duplicates in the coordinates
duplicate_indices <- which(duplicated(coords))
# Print duplicate coordinates and their indices
if (length(duplicate_indices) > 0) {
cat("Duplicate points found at indices:\n")
print(duplicate_indices)
print(coords[duplicate_indices, ])
} else {
cat("No duplicate points found.")
}No duplicate points found.
plot(train_data_sp, col = 'blue', pch = 16, main = "Training and Test Points")
points(test_data_sp, col = 'red', pch = 16)
legend("topright", legend = c("Train", "Test"), col = c("blue", "red"), pch = 16)
4.4.1 Computing Adaptive Bandwidth
bw_adaptive <- bw.gwr(resale_price ~ floor_area_sqm +
+ remaining_lease_yr + storey_order +
dist_to_cbd +
dist_mrt +dist_park +dist_good_sch +
dist_mall + dist_supermarket +
kinder_count + childcare_count + bus_count +
pri_sch_count,
data=train_data_sp,
approach="CV",
kernel="gaussian",
adaptive=TRUE,
longlat=FALSE)Take a cup of tea and have a break, it will take a few minutes.
-----A kind suggestion from GWmodel development group
Adaptive bandwidth: 5521 CV score: 1.731441e+13
Adaptive bandwidth: 3420 CV score: 1.621879e+13
Adaptive bandwidth: 2120 CV score: 1.463892e+13
Adaptive bandwidth: 1318 CV score: 1.255092e+13
Adaptive bandwidth: 821 CV score: 1.09429e+13
Adaptive bandwidth: 515 CV score: 9.593999e+12
Adaptive bandwidth: 324 CV score: 8.517441e+12
Adaptive bandwidth: 208 CV score: 7.754828e+12
Adaptive bandwidth: 134 CV score: 6.98176e+12
Adaptive bandwidth: 90 CV score: 6.343448e+12
Adaptive bandwidth: 61 CV score: 5.761976e+12
Adaptive bandwidth: 45 CV score: 5.505589e+12
Adaptive bandwidth: 33 CV score: 1.088058e+18
Adaptive bandwidth: 50 CV score: 5.56784e+12
Adaptive bandwidth: 39 CV score: 1.633367e+13
Adaptive bandwidth: 46 CV score: 5.515663e+12
Adaptive bandwidth: 41 CV score: 8.61063e+12
Adaptive bandwidth: 43 CV score: 6.725076e+12
Adaptive bandwidth: 41 CV score: 8.61063e+12
Adaptive bandwidth: 42 CV score: 7.145617e+12
Adaptive bandwidth: 41 CV score: 8.61063e+12
Adaptive bandwidth: 41 CV score: 8.61063e+12
Adaptive bandwidth: 40 CV score: 8.158887e+12
Adaptive bandwidth: 40 CV score: 8.158887e+12
Adaptive bandwidth: 39 CV score: 1.633367e+13
Adaptive bandwidth: 39 CV score: 1.633367e+13
Adaptive bandwidth: 38 CV score: 6.963413e+26
Adaptive bandwidth: 38 CV score: 6.963413e+26
Adaptive bandwidth: 37 CV score: 1.267651e+32
Adaptive bandwidth: 37 CV score: 1.267651e+32
Adaptive bandwidth: 36 CV score: 5.745089e+13
Adaptive bandwidth: 36 CV score: 5.745089e+13
Adaptive bandwidth: 35 CV score: 7.005336e+13
Adaptive bandwidth: 35 CV score: 7.005336e+13
Adaptive bandwidth: 34 CV score: 4.815917e+13
Adaptive bandwidth: 34 CV score: 4.815917e+13
Adaptive bandwidth: 33 CV score: 1.088058e+18
Adaptive bandwidth: 33 CV score: 1.088058e+18
Adaptive bandwidth: 32 CV score: 6.963413e+28
Adaptive bandwidth: 32 CV score: 6.963413e+28
write_rds(bw_adaptive, "data/model/bw_adaptive.rds")4.4.2 Constructing the adaptive bandwidth gwr model
bw_adaptive <- read_rds("data/model/bw_adaptive.rds")The result shows that 45 neighbour points will be the optimal bandwidth to be used if adaptive bandwidth is used for this data set.
bw_adaptive[1] 45
Using the derived adaptive bandwidth for gwr-based pricing model
gwr_adaptive <- gwr.basic(formula = resale_price ~ floor_area_sqm +
+ remaining_lease_yr + storey_order +
dist_to_cbd +
dist_mrt +dist_park +dist_good_sch +
dist_mall + dist_supermarket +
kinder_count + childcare_count + bus_count +
pri_sch_count,
data=train_data_sp,
bw=bw_adaptive,
kernel = 'gaussian',
adaptive=TRUE,
longlat = FALSE)write_rds(gwr_adaptive, "data/model/gwr_adaptive.rds")4.4.3 Retrieving gwr output object
gwr_adaptive <- read_rds("data/model/gwr_adaptive.rds")gwr_adaptive ***********************************************************************
* Package GWmodel *
***********************************************************************
Program starts at: 2024-11-08 22:04:00.060262
Call:
gwr.basic(formula = resale_price ~ floor_area_sqm + +remaining_lease_yr +
storey_order + dist_to_cbd + dist_mrt + dist_park + dist_good_sch +
dist_mall + dist_supermarket + kinder_count + childcare_count +
bus_count + pri_sch_count, data = train_data_sp, bw = bw_adaptive,
kernel = "gaussian", adaptive = TRUE, longlat = FALSE)
Dependent (y) variable: resale_price
Independent variables: floor_area_sqm remaining_lease_yr storey_order dist_to_cbd dist_mrt dist_park dist_good_sch dist_mall dist_supermarket kinder_count childcare_count bus_count pri_sch_count
Number of data points: 8922
***********************************************************************
* Results of Global Regression *
***********************************************************************
Call:
lm(formula = formula, data = data)
Residuals:
Min 1Q Median 3Q Max
-184629 -28750 -2892 23356 379703
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.224e+05 6.836e+03 -17.913 < 2e-16 ***
floor_area_sqm 5.056e+03 9.443e+01 53.548 < 2e-16 ***
remaining_lease_yr 4.065e+03 3.601e+01 112.888 < 2e-16 ***
storey_order 3.696e+03 9.655e+01 38.283 < 2e-16 ***
dist_to_cbd -9.133e+00 1.816e-01 -50.294 < 2e-16 ***
dist_mrt -9.300e+00 9.695e-01 -9.593 < 2e-16 ***
dist_park 1.798e+01 4.987e+00 3.606 0.000313 ***
dist_good_sch -1.216e+00 2.915e-01 -4.172 3.05e-05 ***
dist_mall -8.449e+00 4.943e-01 -17.092 < 2e-16 ***
dist_supermarket 1.957e+01 2.693e+00 7.268 3.95e-13 ***
kinder_count 5.745e+03 6.604e+02 8.700 < 2e-16 ***
childcare_count -2.292e+03 2.961e+02 -7.740 1.10e-14 ***
bus_count 1.174e+03 1.927e+02 6.093 1.15e-09 ***
pri_sch_count 2.137e+03 3.329e+02 6.418 1.45e-10 ***
---Significance stars
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 45830 on 8908 degrees of freedom
Multiple R-squared: 0.7319
Adjusted R-squared: 0.7315
F-statistic: 1870 on 13 and 8908 DF, p-value: < 2.2e-16
***Extra Diagnostic information
Residual sum of squares: 1.870814e+13
Sigma(hat): 45796.56
AIC: 216848.7
AICc: 216848.8
BIC: 208169.6
***********************************************************************
* Results of Geographically Weighted Regression *
***********************************************************************
*********************Model calibration information*********************
Kernel function: gaussian
Adaptive bandwidth: 45 (number of nearest neighbours)
Regression points: the same locations as observations are used.
Distance metric: Euclidean distance metric is used.
****************Summary of GWR coefficient estimates:******************
Min. 1st Qu. Median 3rd Qu.
Intercept -3.5086e+07 -6.0826e+05 -9.3356e+04 5.1813e+05
floor_area_sqm -1.0212e+04 2.5946e+03 3.4538e+03 4.4837e+03
remaining_lease_yr -2.5101e+04 1.8887e+03 3.5455e+03 5.3912e+03
storey_order -3.8035e+02 2.0545e+03 2.5576e+03 3.1427e+03
dist_to_cbd -2.4010e+03 -6.7558e+01 1.7922e+00 8.2819e+01
dist_mrt -9.6148e+02 -4.9851e+01 -1.2315e+01 3.7158e+01
dist_park -9.3498e+02 -4.5145e+01 -8.2848e+00 2.5420e+01
dist_good_sch -3.3544e+03 -4.2564e+01 -2.0299e+00 5.5729e+01
dist_mall -2.2828e+03 -7.6823e+01 -2.2997e+01 1.4602e+01
dist_supermarket -6.2412e+02 -3.6264e+01 -6.8819e+00 2.4600e+01
kinder_count -1.4917e+05 -5.3200e+03 2.4367e+02 6.9285e+03
childcare_count -3.3675e+04 -2.8700e+03 -1.1437e+02 1.9870e+03
bus_count -2.7780e+04 -9.7695e+02 5.3470e+02 2.2760e+03
pri_sch_count -3.2768e+06 -4.3014e+03 1.1365e+03 7.2225e+03
Max.
Intercept 2.8589e+07
floor_area_sqm 1.2231e+05
remaining_lease_yr 8.1651e+03
storey_order 6.6459e+03
dist_to_cbd 4.0504e+03
dist_mrt 1.7308e+03
dist_park 2.2689e+02
dist_good_sch 2.3999e+03
dist_mall 1.2744e+03
dist_supermarket 2.0602e+03
kinder_count 1.5951e+05
childcare_count 2.7476e+04
bus_count 1.3722e+04
pri_sch_count 5.7672e+06
************************Diagnostic information*************************
Number of data points: 8922
Effective number of parameters (2trace(S) - trace(S'S)): 1288.516
Effective degrees of freedom (n-2trace(S) + trace(S'S)): 7633.484
AICc (GWR book, Fotheringham, et al. 2002, p. 61, eq 2.33): 206560.7
AIC (GWR book, Fotheringham, et al. 2002,GWR p. 96, eq. 4.22): 205226.9
BIC (GWR book, Fotheringham, et al. 2002,GWR p. 61, eq. 2.34): 204810.4
Residual sum of squares: 4.535666e+12
R-square value: 0.9349957
Adjusted R-square value: 0.9240217
***********************************************************************
Program stops at: 2024-11-08 22:05:02.417782
- Adjusted R-square is 0.9240217, which indicates that the GWR model is improved from the general multiple linear regression.
- dist_food and dist_eldercare are removed due to them being statistically insignificant
4.4.4 Computing adaptive bandwidth for the test data
bw_adaptive_test <- bw.gwr(resale_price ~ floor_area_sqm +
+ remaining_lease_yr + storey_order +
dist_to_cbd + dist_food +
dist_mrt +dist_park +dist_good_sch +
dist_mall + dist_supermarket +
kinder_count + childcare_count + bus_count +
pri_sch_count,
data=test_data_sp,
approach="CV",
kernel="gaussian",
adaptive=TRUE,
longlat=FALSE)bw_adaptive_testAdaptive bandwidth to be used for test data is 25.
4.4.5 Computing predicted values of the test data
gwr_pred <- gwr.predict(
formula = resale_price ~ floor_area_sqm +
+ remaining_lease_yr + storey_order +
dist_to_cbd +
dist_mrt + dist_park + dist_good_sch +
dist_mall + dist_supermarket +
kinder_count + childcare_count + bus_count +
pri_sch_count,
data=train_data_sp,
predictdata = test_data_sp,
bw=bw_adaptive,
kernel = 'gaussian',
adaptive=TRUE,
longlat = FALSE)4.5 Random Forest Model
Preparing the coordinates:
coords <- st_coordinates(st_as_sf(resale_final))
coords_train <- st_coordinates(train_data)
coords_test <- st_coordinates(test_data)coords_train <- write_rds(coords_train, "data/model/coords_train.rds" )
coords_test <- write_rds(coords_test, "data/model/coords_test.rds" )train_data <- train_data %>%
st_drop_geometry()set.seed(1234)
rf <- ranger(resale_price ~ floor_area_sqm +
+ remaining_lease_yr + storey_order +
dist_to_cbd +
dist_mrt +dist_park +dist_good_sch +
dist_mall + dist_supermarket +
kinder_count + childcare_count + bus_count +
pri_sch_count,
data=train_data,
num.trees = 100)
rfwrite_rds(rf, "data/model/rf.rds")rf <- read_rds("data/model/rf.rds")
rfRanger result
Call:
ranger(resale_price ~ floor_area_sqm + +remaining_lease_yr + storey_order + dist_to_cbd + dist_mrt + dist_park + dist_good_sch + dist_mall + dist_supermarket + kinder_count + childcare_count + bus_count + pri_sch_count, data = train_data, num.trees = 100)
Type: Regression
Number of trees: 100
Sample size: 8922
Number of independent variables: 13
Mtry: 3
Target node size: 5
Variable importance mode: none
Splitrule: variance
OOB prediction error (MSE): 586305632
R squared (OOB): 0.9250385
Currently, the number of trees is set to 100.
4.5.1 Calibrating Geographical Random Forest Model
4.5.1.1 Calibrating using training data
set.seed(1234)
gwRF_adaptive <- grf(formula = resale_price ~ floor_area_sqm +
+ remaining_lease_yr + storey_order +
dist_to_cbd +
dist_mrt +dist_park +dist_good_sch +
dist_mall + dist_supermarket +
kinder_count + childcare_count + bus_count +
pri_sch_count,
dframe=train_data,
bw=bw_adaptive,
kernel="adaptive",
coords=coords_train)Saving the model for future retrieval:
write_rds(gwRF_adaptive, "data/model/gwRF_adaptive.rds")gwRF_adaptive <- read_rds("data/model/gwRF_adaptive.rds")4.5.2 Predicting by using test data
The code chunk below will be used to combine the test data with its corresponding coordinates data.
test_data <- cbind(test_data, coords_test) %>%
st_drop_geometry()gwRF_pred <- predict.grf(gwRF_adaptive,
test_data,
x.var.name="X",
y.var.name="Y",
local.w=1,
global.w=0)GRF_pred <- write_rds(gwRF_pred, "data/model/GRF_pred.rds")GRF_pred <- read_rds("data/model/GRF_pred.rds")
GRF_pred_df <- as.data.frame(GRF_pred)
test_data_p <- cbind(test_data, GRF_pred_df)
write_rds(test_data_p, "data/model/test_data_p.rds")4.5.3 Calculating Root Mean Square Error
rmse_test <- rmse(test_data_p$resale_price,
test_data_p$GRF_pred)rmse_test/mean(test_data_p$resale_price)*100[1] 8.64067

- After normalising the RMSE with the mean of the actual resale price, it shows that the normalised RMSE is 8.64% , which indicates that this model is fairly good, as it is lower than 10%.
4.5.4 Visualising the predicted values
g <- ggplot(data = test_data_p,
aes(x = GRF_pred,
y = resale_price)) +
geom_point() +
geom_abline(slope = 1,
intercept = 0,
color = "red",
size = 1.5)
ig <- ggplotly(g)
ig
By comparing the points to the diagonal line, we can conclude that although most of the points follows the diagonal lines, the points are not very close to the line.
Generally, this Geographical Random Forest model tends to predict lower than the actual resale price.
Also, for more expensive resale HDB, most of the predicted value are much lower than the actual resale price.
Lastly, there are outliers (three points below the diagonal lines) that has predicted price exceeding actual resale price by $200,000.